# Notebook theming helpers from jupyterthemes.
from jupyterthemes import get_themes
import jupyterthemes as jt
from jupyterthemes.stylefx import set_nb_theme
# Select the "monokai" notebook theme.
set_nb_theme("monokai")
# Plotting and data-handling libraries used by the analysis cells.
import plotly.express as px
import pandas as pd
import numpy as np
from itertools import chain
import plotly.io as pio
# pio.templates.default = "plotly"
import plotly.graph_objects as go
import glob
# Silence pandas' chained-assignment warning (the pandas default is 'warn').
pd.options.mode.chained_assignment = None # default='warn'
# Allow up to 500 columns and 500 rows when a DataFrame is displayed.
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 500)
from IPython.display import display, HTML
# Inject CSS that sets elements with class "container" to 90% width.
display(HTML("<style>.container { width:90% !important; }</style>"))
When we watch highlights of tennis, we are shown very long points as demonstrations of the incredible skill.
However, the fact is that the majority of points are over quickly.
This isn't just true for the pros; it's true for amateurs too.
The below chart shows that the MAJORITY of points are over within 4 shots.
This means you only have 2 of your own shots to win the majority of points.
# Rally-length overview: what share of points ends within 1-2, 3-4 or 5+ shots.
df = pd.read_csv('D:/OneDrive/DataSci/Tennis/04_Analysis/NewVisualisations/data\\SummaryStats_250313_JJO_JR.csv')

# Take an explicit copy so the new column below is added to an independent
# frame rather than to a filtered slice of df.
rl = df[df.Label_0 == "RallyLen_Breakdown"].copy()

# Bucket each row by the rally-length tag found in its Label; any label that
# contains neither "3to4" nor "over4" falls into the 1-2 shot bucket.
rl["RallyLengths"] = np.where(
    rl.Label.str.contains("3to4"),
    "3_to_4_Shots",
    np.where(rl.Label.str.contains("over4"), "5+Shots", "1_to_2_Shots"),
)

# Total frequency per bucket, then each bucket's share of the overall total.
rl2 = rl.groupby("RallyLengths")["Frequency"].sum().reset_index()
rl2["ProportionOfRallyLengths"] = rl2.Frequency / rl2.Frequency.sum()
# Every row gets the same empty y value, so all buckets share one bar position.
rl2["RallyBreakdown"] = ""

fig = px.bar(
    rl2,
    x="ProportionOfRallyLengths",
    y="RallyBreakdown",
    orientation="h",
    color="RallyLengths",
    title="<b>The Majority of Tennis Points are Over After Your 2nd Shot</b>",
)
# Format x-axis ticks as whole percentages. Kept as the final expression so the
# notebook cell displays the figure it returns.
fig.update_xaxes(tickformat=".0%")
This is a big insight. Now I know points are pretty short, so serving and returning are important.
But do I know specifically what I need to work on here?
Is it my 1st or 2nd serve, is it my backhand?
At this point we don't know, so we need to get deeper.
Let's break down the serve part of our game.
def read_n_sunburst(path, sunb_title):
    """Read the summary stats and show 1st vs 2nd serve sunbursts side by side.

    Acts as the single entry point: loads the CSV, builds the sunburst table
    for first and for second serves with ``plusdata``, and passes both to
    ``compare_sunbursts`` for display.

    Args:
        path: Location of the summary-stats CSV, passed to ``pd.read_csv``.
        sunb_title: Title text for the combined figure.

    Returns:
        None. The figure is shown as a side effect.
    """
    d_in = pd.read_csv(path)

    # Both root labels use the same markup (colon inside the bold tag) so the
    # two charts are titled consistently.
    first_serve = plusdata(d_in, "Serve_First_", "<b>1st Serve+:</b>")
    second_serve = plusdata(d_in, "Serve_Second_", "<b>2nd Serve+:</b>")

    # compare_sunbursts displays the figure itself, so nothing is captured here.
    compare_sunbursts(first_serve, second_serve, sunb_title)
def plusdata(d_in, typ, name):
    """Build the labels/parents/values table for one serve type's sunburst.

    Rows are kept when ``Label_0`` equals
    "Start_SplitBy_Serve_Type_Part_OutCome_Shot" and ``Label`` contains
    ``typ``. For each kept row the outcome ("Won", "Lost", otherwise "Rally")
    is read from the label text and the shot is the last "_"-separated token
    of the label.

    Args:
        d_in: Frame with ``Label_0``, ``Label`` and ``Frequency`` columns.
            It is not modified.
        typ: Text matched against ``Label`` with ``Series.str.contains``
            (e.g. "Serve_First_").
        name: Display name of the root node; the total point count is
            appended to it.

    Returns:
        DataFrame with ``labels``, ``parents`` and ``values`` columns holding
        nine nodes: the root, three shot nodes (serve only, +forehand,
        +backhand) and five won/lost leaves.
    """
    is_serve_split = d_in.Label_0 == "Start_SplitBy_Serve_Type_Part_OutCome_Shot"
    is_requested_type = d_in.Label.str.contains(typ)
    # Copy so the derived columns are added to an independent frame and never
    # to a slice of the caller's data.
    splus = d_in.loc[is_serve_split & is_requested_type, ["Label", "Frequency"]].copy()

    splus["Outcome"] = np.where(
        splus.Label.str.contains("Won"),
        "Won",
        np.where(splus.Label.str.contains("Lost"), "Lost", "Rally"),
    )
    splus["Shot"] = splus.Label.str.split("_").str[-1]

    pts = splus.Frequency.sum()
    # Proportions are rounded per row before any aggregation, so every value
    # below is a sum of the same rounded numbers.
    splus["Proportion"] = (splus.Frequency / pts).round(2)
    root = f"{name} <br>{pts} points"

    def share(shot, outcome=None):
        # Summed proportion for one shot, optionally limited to one outcome.
        rows = splus.Shot == shot
        if outcome is not None:
            rows = rows & (splus.Outcome == outcome)
        return splus.loc[rows, "Proportion"].sum()

    return pd.DataFrame({
        "labels": [
            root, "Serve Only", "+Forehand", "+Backhand",
            "Won with Serve", "Won with FH", "Lost with FH",
            "Won with BH", "Lost with BH",
        ],
        "parents": [
            "", root, root, root,
            "Serve Only", "+Forehand", "+Forehand",
            "+Backhand", "+Backhand",
        ],
        "values": [
            splus.Proportion.sum(), share("Serve"), share("FH"), share("BH"),
            share("Serve", "Won"), share("FH", "Won"), share("FH", "Lost"),
            share("BH", "Won"), share("BH", "Lost"),
        ],
    })
def gen_sunburst_go(d_in, position):
    """Build one sunburst trace placed in a given grid column.

    Args:
        d_in: Table providing the ``labels``, ``parents`` and ``values``
            entries for the trace.
        position: Grid column index used for the trace's domain.

    Returns:
        A ``go.Sunburst`` trace with ``branchvalues="total"`` and sorting
        switched off.
    """
    trace_spec = {
        "labels": d_in["labels"],
        "parents": d_in["parents"],
        "values": d_in["values"],
        "branchvalues": "total",
        "domain": {"column": position},
        "sort": False,
    }
    return go.Sunburst(**trace_spec)
def compare_sunbursts(in_1, in_2, titel):
    """Show two sunburst charts side by side under one shared title.

    Args:
        in_1: Sunburst table drawn in the left grid column (column 0).
        in_2: Sunburst table drawn in the right grid column (column 1).
        titel: Title text for the figure.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    side_by_side = go.Figure()
    # The position of each table in the tuple is its grid column.
    for column, table in enumerate((in_1, in_2)):
        side_by_side.add_trace(gen_sunburst_go(table, column))

    layout_options = {
        "grid": {"columns": 2, "rows": 1},
        "margin": {"t": 0, "l": 0, "r": 0, "b": 0},
        "title": go.layout.Title(text=titel, xref="paper", x=0.5, y=0.9),
    }
    side_by_side.update_layout(**layout_options)
    side_by_side.show()
# Render the 1st vs 2nd serve sunbursts from the summary-stats CSV.
read_n_sunburst(
    "D:/OneDrive/DataSci/Tennis/04_Analysis/NewVisualisations/data\\SummaryStats_250313_JJO_JR.csv",
    "<b>Serve+ Outcomes<br>Split by 1st & 2nd Serves</b>",
)
These charts break down the first 2 shots when we serve, and we see the difference between 1st and 2nd serve outcomes.
Let's focus on the 1st serve to begin with. 16% of points are won directly with the serve. This is a little low, so it needs some work.
57% of 1st serve points played use a serve and then a forehand. This is good but still a little low, as we want to maximise the opportunity to attack and the forehand is the primary weapon for this.
This ties in with the serve only percentage being a little low.
This tells us that the serve is a little too weak and we cannot get the outcomes we need here.
What is also telling is what happens when we play a serve + Forehand.
Of the 57% of points played, we win 14% directly and lose 10% directly.
The remaining 33% continue to rally - i.e. points where 3 or more shots are needed.
This shows that the serve + forehand really needs to be worked on.